Task 3: Experimentation

1. Comparison of Closed Form and Gradient Descent Approaches

Comparisons are made with the text features ignored. The features used here are therefore:

  1. Is root or not
  2. Number of child comments
  3. Controversiality
In [1]:
import json
import pickle
from pathlib import Path

# Standard plotly imports
import plotly.graph_objs as go
import plotly.plotly as py
from plotly.offline import iplot

# Data science imports
import pandas as pd
import numpy as np

# Cufflinks wrapper on plotly
import cufflinks
cufflinks.go_offline()

# Set the global theme for cufflinks
cufflinks.set_config_file(world_readable=True, theme='solar', offline=True)

# Resolve project directories relative to the notebook's working directory.
# NOTE: '!pwd' is IPython shell magic — this cell only runs inside Jupyter/IPython.
cur_dir = !pwd
project_dir = Path(cur_dir[0]).resolve().parents[0]  # notebook lives one level below the project root
features_path = project_dir / 'src' / 'features'  # pickled feature matrices and target vectors
predictions_path = project_dir / 'reports'  # saved model predictions (JSON)
In [42]:
# Load the training targets and the per-model training predictions.
# Fix: the original called open() without ever closing the file handles;
# use context managers so the files are closed promptly.
with open(features_path / 'training_y.pkl', 'rb') as f:
    Y_train = pickle.load(f)
with open(predictions_path / 'predictions_train.json') as f:
    predictions_train = json.load(f)

predictions_train contains results for the following models:

In [43]:
# Enumerate the models whose predictions are stored in predictions_train.
for index, prediction in enumerate(predictions_train):
    print('%d. %s' % (index, prediction['name']))
0. ClosedForm_train
1. ClosedForm_160_train
2. ClosedForm_60_train
3. ClosedForm_no_text_train
4. GradientDescent_train
5. GradientDescent_160_train
6. GradientDescent_60_train
7. GradientDescent_no_text_train

The hyperparameters for the GD models above are:

In [44]:
# Display the hyperparameters of the GradientDescent_train model (index 4).
predictions_train[4]['hparams']
Out[44]:
{'w_0': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'beta': 0.0001,
 'eta_0': 1e-05,
 'eps': 1e-06}

Comparing ClosedForm_no_text_train and GradientDescent_no_text_train:

In [45]:
# Visual comparison of the two no-text models against the training targets.
cf_train_no_text = predictions_train[3]
gd_train_no_text = predictions_train[7]

cf_prediction = cf_train_no_text['y_predicted']
gd_prediction = gd_train_no_text['y_predicted']

# Shared x axis: one point per training example.
example_numbers = list(range(1, 10001))
# Y_train rows are single-element lists; flatten them for plotting.
target_values = [row[0] for row in Y_train.tolist()]

trace1 = go.Scatter(
    x=example_numbers,
    y=target_values,
    name='Target',
    line=dict(color='#F9A746', width=1),
    connectgaps=True
)
trace2 = go.Scatter(
    x=example_numbers,
    y=cf_prediction,
    name='Closed form',
    line=dict(color='#E85285', width=1),
    connectgaps=True
)
trace3 = go.Scatter(
    x=example_numbers,
    y=gd_prediction,
    name='Gradient descent',
    line=dict(color='#6A1B9A', width=1),
    connectgaps=True
)
data = [trace1, trace2, trace3]

layout = dict(
    title='Popularity Prediction on Training Data',
    xaxis=dict(title='Example no.'),
    yaxis=dict(title='Popularity score'),
    plot_bgcolor='#151516',
)

fig = dict(data=data, layout=layout)
iplot(fig)

Observations:

  1. In general, closed form predicts higher popularity than gradient descent (higher peak values)
  2. For the majority of examples, the closed form solution predicts more accurately than gradient descent
  3. Both models fail to predict the more extreme popularity scores, i.e. high and low peaks, which implies underfitting and emphasizes the low number of features. This makes sense as we're not currently using text features
In [46]:
# Per-example squared error for both no-text models on the training set.
# Predictions are flat lists; reshape to a column so subtraction broadcasts
# against the (n, 1) Y_train array exactly as before.
cf_error = (np.array(cf_prediction).reshape(-1, 1) - Y_train) ** 2
gd_error = (np.array(gd_prediction).reshape(-1, 1) - Y_train) ** 2

example_numbers = list(range(1, 10001))

trace1 = go.Scatter(
    x=example_numbers,
    y=[e for [e] in cf_error.tolist()],
    name='Closed form',
    line=dict(color='#E85285', width=0.5),
    connectgaps=True
)
trace2 = go.Scatter(
    x=example_numbers,
    y=[e for [e] in gd_error.tolist()],
    name='Gradient descent',
    line=dict(color='#6A1B9A', width=0.5),
    connectgaps=True
)
data = [trace1, trace2]

layout = dict(
    title='Squared Error for Each Training Example',
    xaxis=dict(title='Example no.'),
    yaxis=dict(title='Squared error'),
)

fig = dict(data=data, layout=layout)
iplot(fig)

Observations:

Not much to say about this chart that hasn't been said. It was plotly practice, I guess.

To test the hypothesis that the models are underfitting without text features, we will check their accuracy on the validation set:

In [47]:
# Load the per-model validation predictions and list the models.
# Fix: the original never closed the file handle; use a context manager.
with open(predictions_path / 'predictions_validate.json') as f:
    predictions_validate = json.load(f)
for i, p in enumerate(predictions_validate):
    print('%d. %s' % (i, p['name']))
0. ClosedForm_validate
1. ClosedForm_160_validate
2. ClosedForm_60_validate
3. ClosedForm_no_text_validate
4. GradientDescent_validate
5. GradientDescent_160_validate
6. GradientDescent_60_validate
7. GradientDescent_no_text_validate
In [48]:
# Bug fix: the original plotted Y_train (the 10,000 training targets) as the
# 'Target' trace on this 1,000-example validation chart, so the target line
# did not correspond to the predictions at all. Load and plot the actual
# validation targets instead (same file loaded later for the MSE comparison).
with open(features_path / 'validation_y.pkl', 'rb') as f:
    Y_validate = pickle.load(f)

cf_validate_no_text = predictions_validate[3]
gd_validate_no_text = predictions_validate[7]

cf_prediction = cf_validate_no_text['y_predicted']
gd_prediction = gd_validate_no_text['y_predicted']

trace1 = go.Scatter(
    x=[i for i in range(1, 1001)],
    y=[y for [y] in Y_validate.tolist()],
    name = 'Target',
    line = dict(
        color='#F9A746',
        width=1),
    connectgaps=True
)
trace2 = go.Scatter(
    x=[i for i in range(1, 1001)],
    y=cf_prediction,
    name = 'Closed form',
    line = dict(
        color='#E85285',
        width=1),
    connectgaps=True
)
trace3 = go.Scatter(
    x=[i for i in range(1, 1001)],
    y=gd_prediction,
    name = 'Gradient descent',
    line = dict(
        color='#6A1B9A',
        width=1),
    connectgaps=True
)
data = [trace1, trace2, trace3]

layout = dict(title='Popularity Prediction on Validation Data',
              xaxis=dict(title='Example no.'),
              yaxis=dict(title='Popularity score'),
              plot_bgcolor = '#151516')

fig = dict(data=data, layout=layout)
iplot(fig)

Observations:

Many popularity peaks are not captured by the model, which reinforces the underfitting hypothesis

In [49]:
# Summary table: MSE, hyperparameters and iteration counts for the no-text
# models on both the training and validation sets.
header_labels = ['<b>Set</b>',
                 '<b>Model</b>',
                 '<b>MSE</b>',
                 '<b>Hyperparameters</b>',
                 '<b>Iterations</b>']
set_column = ['Training', 'Training', 'Validation', 'Validation']
model_column = ['Closed Form', 'Gradient Descent', 'Closed Form', 'Gradient Descent']
mse_column = [cf_train_no_text['mse'],
              gd_train_no_text['mse'],
              cf_validate_no_text['mse'],
              gd_validate_no_text['mse']]
hparams_column = ['', str(gd_train_no_text['hparams']),
                  '', str(gd_validate_no_text['hparams'])]
iterations_column = ['', gd_train_no_text['num_iterations'],
                     '', gd_validate_no_text['num_iterations']]

trace = go.Table(
    header=dict(values=header_labels),
    cells=dict(values=[set_column, model_column, mse_column,
                       hparams_column, iterations_column])
)

layout = dict(title='Predictions Without Text Features')
data = [trace]
iplot(dict(data=data, layout=layout))

Observations:

The summary table seems to suggest there is no underfitting, as the models performed better on the validation set than on the training set. However, this may simply be due to random sampling and noise.

On stability:

The CF's weight vector is computed from a fixed formula - it will therefore give the same prediction for the same inputs.
On the other hand, GD's prediction depends on the hyperparameters set at training stage - different GD models will thus yield different predictions even for the same inputs.

This means GD is more unstable than CF.

Conclusions:

  1. Overall, the closed form model performed better than gradient descent (better MSE for both sets)
  2. Gradient descent is more unstable than closed form
  3. Closed form has better runtime during training (because of its straightforward computation), but there is still risk of obtaining singular matrices

2. No Text Features vs 160 Words vs 60 Words

Comparisons will be made using the closed form model.

In [50]:
# MSE table for closed-form models with no text, top-60 and top-160 features.
cf_train_160 = predictions_train[1]
cf_train_60 = predictions_train[2]
cf_validate_160 = predictions_validate[1]
cf_validate_60 = predictions_validate[2]

table_columns = [
    ['<b>Training</b>', '<b>Validation</b>'],
    [cf_train_no_text['mse'], cf_validate_no_text['mse']],
    [cf_train_60['mse'], cf_validate_60['mse']],
    [cf_train_160['mse'], cf_validate_160['mse']],
]

trace = go.Table(
    header=dict(values=['', '<b>No Text</b>', '<b>Top 60</b>', '<b>Top 160</b>']),
    cells=dict(values=table_columns)
)

layout = dict(title='Closed Form MSE')
data = [trace]
iplot(dict(data=data, layout=layout))
In [51]:
# Bug fix: the original plotted Y_train (the 10,000 training targets) as the
# 'Target' trace on this 1,000-example validation chart. Load and plot the
# validation targets instead so the target line matches the predictions.
with open(features_path / 'validation_y.pkl', 'rb') as f:
    Y_validate = pickle.load(f)

cf_y_no_text = cf_validate_no_text['y_predicted']
cf_y_60 = cf_validate_60['y_predicted']
cf_y_160 = cf_validate_160['y_predicted']

trace1 = go.Scatter(
    x=[i for i in range(1, 1001)],
    y=[y for [y] in Y_validate.tolist()],
    name = '(white) Target',
    line = dict(
        color='white',
        width=1),
    connectgaps=True
)
trace2 = go.Scatter(
    x=[i for i in range(1, 1001)],
    y=cf_y_no_text,
    name = 'No Text',
    line = dict(
        color='#F9A746',
        width=1),
    connectgaps=True
)
trace3 = go.Scatter(
    x=[i for i in range(1, 1001)],
    y=cf_y_60,
    name = 'Top 60',
    line = dict(
        color='#E85285',
        width=1),
    connectgaps=True
)
trace4 = go.Scatter(
    x=[i for i in range(1, 1001)],
    y=cf_y_160,
    name = 'Top 160',
    line = dict(
        color='#6A1B9A',
        width=1),
    connectgaps=True
)
data = [trace1, trace2, trace3, trace4]

layout = dict(title='Closed Form Predictions on Validation Data',
              xaxis=dict(title='Example no.'),
              yaxis=dict(title='Popularity score'),
              plot_bgcolor = '#151516'
             )

fig = dict(data=data, layout=layout)
iplot(fig)

Observations:

Judging from the MSEs, top-60 is an incremental improvement over no text, and top-160 is an incremental improvement over top-60.
Even though top-160's validation MSE is the only one higher than its training MSE, the difference is small (1.0477763217987115 vs. 1.0686390774956736).
The fact that top-160's validation MSE is still lower than no text's and top-60's demonstrates that top-160 gives the best predictions out of the three.

Conclusions:

Adding the top 160 words features overall improves our model.

3. New Features: Comment Length and Porter Stemming

We'll vary the complexity of the model to find the best performer.
Specifically, we'll vary the size of the 'stem' vector in the features from 160 to 1.
Ideally, the validation MSE will hit a minimum at a certain degree of freedom, and this is the degree of freedom we will use in our final model.

In [28]:
src_path = project_dir / 'src'

# Manual module loading as a stand-in for 'from src.models import ClosedForm',
# since the notebook is not executed as part of the package.
import importlib.util
models_spec = importlib.util.spec_from_file_location(
    "src.models", str(src_path / 'models' / 'models.py'))
models_module = importlib.util.module_from_spec(models_spec)
models_spec.loader.exec_module(models_module)
ClosedForm = models_module.ClosedForm
In [37]:
# Fix: the original leaked both pickle file handles and duplicated the
# matrix-construction code for the training and validation sets; it also
# built the lists in descending stem-size order only to reverse them after.
with open(features_path / 'training_X.pkl', 'rb') as f:
    X_train = pickle.load(f)
with open(features_path / 'validation_X.pkl', 'rb') as f:
    X_validate = pickle.load(f)

# Structure of a row of X_train:
# index 0: is_root
#       1: controversiality
#       2: children
#       3: length
#       4-163: x_counts
#       164-323: stem
#       324: bias term
stem_start = 164


def _with_stem_size(X, size):
    """Return a copy of feature matrix X with the stem vector truncated.

    Keeps the fixed features and x_counts untouched, keeps only the first
    `size` stem entries, and re-appends the bias term after them.
    """
    return np.array([
        np.concatenate((
            row[:stem_start],                   # beginning of row (fixed features + x_counts)
            row[stem_start:stem_start + size],  # truncated stem vector
            np.array([1]),                      # bias term
        ))
        for row in X
    ])


# One X matrix per stem vector size, in ascending order of size (1..160).
X_train_list = [_with_stem_size(X_train, i) for i in range(1, 161)]
X_validate_list = [_with_stem_size(X_validate, i) for i in range(1, 161)]
In [39]:
# Fix: the original's loop variables were named X_train / X_validate, which
# shadowed and clobbered the global feature matrices for any later cell; it
# also leaked three file handles (two reads, one write).
with open(features_path / 'validation_y.pkl', 'rb') as f:
    Y_validate = pickle.load(f)

# The comparison is already saved to an external file.
# If the file isn't there, recreate it.
# WARNING: THIS TAKES SO LONG
comparison_file = predictions_path / 'complexity_comparison.json'

if comparison_file.is_file():
    with open(comparison_file) as f:
        complexity_comparisons = json.load(f)
else:
    complexity_comparisons = []

    # Stem sizes ascend 1..160, so size == index + 1.
    for i, (X_tr, X_val) in enumerate(zip(X_train_list, X_validate_list)):
        model = ClosedForm().train(X_tr, Y_train)

        complexity_comparisons.append({
            'stem vector size': i + 1,
            'train mse': model.mse(X_tr, Y_train),
            'validate mse': model.mse(X_val, Y_validate),
        })

    with open(comparison_file, 'w') as f:
        json.dump(complexity_comparisons, f)
In [40]:
# Create dataframe directly from the list of records.
# Fix: the original round-tripped through json.dumps + pd.read_json, an
# unnecessary serialize/deserialize of data already in memory.
df = pd.DataFrame(complexity_comparisons)

With ~140 stop words removed:

In [27]:
# Train vs. validation MSE as a function of stem vector size.
mse_by_complexity = df[['stem vector size', 'train mse', 'validate mse']]
mse_by_complexity = mse_by_complexity.set_index('stem vector size')
mse_by_complexity.iplot(
    mode='lines',
    xTitle='Model Complexity (Stem Vector Size)',
    yTitle='MSE',
    title='MSE Against Model Complexity',
)

Observations:

Horrible.

With no stop words removed:

In [33]:
# Same complexity chart, re-run after regenerating df with no stop words removed.
complexity_mse = df[['stem vector size', 'train mse', 'validate mse']]
complexity_mse.set_index('stem vector size').iplot(
    mode='lines',
    xTitle='Model Complexity (Stem Vector Size)',
    yTitle='MSE',
    title='MSE Against Model Complexity',
)

Observations:

With stop words, the model performs worse than without e.g. final mse is 1.9 (with stop words) compared to 1.4 (without stop words).

With ~70 stop words removed:

In [41]:
# Same complexity chart, re-run after regenerating df with ~70 stop words removed.
selected_columns = ['stem vector size', 'train mse', 'validate mse']
df[selected_columns].set_index('stem vector size').iplot(
    mode='lines',
    xTitle='Model Complexity (Stem Vector Size)',
    yTitle='MSE',
    title='MSE Against Model Complexity',
)

Observations:

3 tries and MSE just gets worse. This feature's a bust.
For the final model, we'll keep it at 42 degrees of freedom with all ~140 stop words removed.

In [52]:
# MSE table comparing the full model against the top-160 model.
cf_train = predictions_train[0]
cf_validate = predictions_validate[0]

set_labels = ['<b>Training</b>', '<b>Validation</b>']
top_160_mses = [cf_train_160['mse'], cf_validate_160['mse']]
full_mses = [cf_train['mse'], cf_validate['mse']]

trace = go.Table(
    header=dict(values=['',
                        '<b>Top 160</b>',
                        '<b>Full</b> (top 160, length and stemming)']),
    cells=dict(values=[set_labels, top_160_mses, full_mses])
)

layout = dict(title='Closed Form MSE')
data = [trace]
iplot(dict(data=data, layout=layout))

4. Running on Test Set

In [53]:
# Load the per-model test-set predictions and list the models.
# Fix: the original never closed the file handle; use a context manager.
with open(predictions_path / 'predictions_test.json') as f:
    predictions_test = json.load(f)
for i, p in enumerate(predictions_test):
    print('%d. %s' % (i, p['name']))
0. ClosedForm_test
1. ClosedForm_160_test
2. ClosedForm_60_test
3. ClosedForm_no_text_test
4. GradientDescent_test
5. GradientDescent_160_test
6. GradientDescent_60_test
7. GradientDescent_no_text_test
In [54]:
# MSE table for the top-160 and full models across all three data sets.
cf_test_160 = predictions_test[1]
cf_test = predictions_test[0]

row_labels = ['<b>Training</b>', '<b>Validation</b>', '<b>Test</b>']
top_160_mses = [cf_train_160['mse'], cf_validate_160['mse'], cf_test_160['mse']]
full_mses = [cf_train['mse'], cf_validate['mse'], cf_test['mse']]

trace = go.Table(
    header=dict(values=['',
                        '<b>Top 160</b>',
                        '<b>Full</b> (top 160, length and stemming)']),
    cells=dict(values=[row_labels, top_160_mses, full_mses])
)

layout = dict(title='Closed Form MSE')
data = [trace]
iplot(dict(data=data, layout=layout))
In [55]:
# Predicted vs. target popularity on the test set for the top-160 and full models.
cf_y_160 = cf_test_160['y_predicted']
cf_y = cf_test['y_predicted']

# NOTE(review): the 'Target' trace below plots Y_train (10,000 training
# targets) against 1,000 test examples — it presumably should be the
# test-set targets instead; confirm the target pickle in src/features.
trace1 = go.Scatter(
    x=[i for i in range(1, 1001)],
    y=[y for [y] in Y_train.tolist()],
    name = 'Target',
    line = dict(
        color='#F9A746',
        width=1),
    connectgaps=True
)
trace2 = go.Scatter(
    x=[i for i in range(1, 1001)],
    y=cf_y,
    name = 'Full',
    line = dict(
        color='#E85285',
        width=1),
    connectgaps=True
)
trace3 = go.Scatter(
    x=[i for i in range(1, 1001)],
    y=cf_y_160,
    name = 'Top 160',
    line = dict(
        color='#6A1B9A',
        width=1),
    connectgaps=True
)
data = [trace1, trace2, trace3]

layout = dict(title='Closed Form Predictions on Test Data',
              xaxis=dict(title='Example no.'),
              yaxis=dict(title='Popularity score'),
              plot_bgcolor = '#151516'
             )

fig = dict(data=data, layout=layout)
iplot(fig)

Observations:

The full model (2 new features) only outperforms the top-160 model on the training set, and is worse on both the validation and test sets

Conclusions:

Our best model overfits and I'm sad.